Here we investigate the outliers of a clustering on gene-expression generated from scRNA-seq data
First we load the with Seurat pre-processed 33k PBMC dataset from 10x Genomics. This data is analyzed using PCA and t-SNE on the variable genes. SNN (Shared nearest neighbour) is used to cluster the data in PC space.
load("../data/seurat_33k_merged")
It’s fairly strait forward to select outliers inside t-SNE space.
The following function is used to select t-SNE outliers:
tsne.neighbours <- function(data, max.distance = 2) {
pb <- txtProgressBar(min = 0, max = nrow(data), style = 3)
for (i in 1:nrow(data)) {
# Calculate neighbours within max.distance
neighbours <- data$tSNE_1 > (data[i,"tSNE_1"] - max.distance) &
data$tSNE_1 < (data[i,"tSNE_1"] + max.distance) &
data$tSNE_2 > (data[i,"tSNE_2"] - max.distance) &
data$tSNE_2 < (data[i,"tSNE_2"] + max.distance) &
(data$tSNE_1 - data[i,"tSNE_1"])^2 + (data$tSNE_2 - data[i,"tSNE_2"])^2 < max.distance^2
data[i,"neighbours"] <- sum(neighbours) -1 # -1 to exclude current cell
setTxtProgressBar(pb, i)
}
close(pb)
return(data)
}
system.time(pbmc33k.tsne.rot <- tsne.neighbours(data = pbmc33k.merged@tsne.rot))
It’s a simple function to determine to amount of neighbours within a certain distance (euclidian). The neighbours of each cell within in radius of 2 are caculated
If you select the cells that have less then 60 t-SNE neighbours.
plot(pbmc33k.merged@tsne.rot$tSNE_2~pbmc33k.merged@tsne.rot$tSNE_1,
pch=19, cex=0.4, col = ifelse(pbmc33k.tsne.rot$neighbours < 60 ,'red','green'))
Now let’s take a look at the number of gene and reads of the outliers. First, subset the data:
all.cells <- FetchData(pbmc33k.merged, c("nUMI", "nGene", "percent.mito"))
outlier.data <- FetchData(pbmc33k.merged, c("nUMI", "nGene", "percent.mito"),
cells.use = WhichCells(pbmc33k.merged, cells.use = pbmc33k.tsne.rot$neighbours < 20))
Compare the number of genes, mitochondrial gene content and amount of reads to total dataset
par(mfrow=c(1,3))
boxplot(outlier.data$nGene, all.cells$nGene, names = c("Outlier nGene", "nGene total") )
boxplot(outlier.data$nUMI, all.cells$nUMI, names = c("Outlier nUMI", "nUMI total") )
boxplot(outlier.data$percent.mito, all.cells$percent.mito, names = c("Outlier mito %", "mito % total") )
par(mfrow=c(1,1))
Now let’s investigate the data of the Th-cells identified by the SNN algorithm and compare the outliers vs the non outliers
t.cell.data <- FetchData(pbmc33k.merged, c("nUMI", "nGene", "percent.mito"),
cells.use = WhichCells(pbmc33k.merged, ident = c(11,13)))
t.cell.outliers.data <- FetchData(pbmc33k.merged, c("nUMI", "nGene", "percent.mito"),
cells.use = WhichCells(pbmc33k.merged, cells.use = pbmc33k.tsne.rot$neighbours < 20, ident = c(11,13)))
par(mfrow=c(1,3))
boxplot(t.cell.outliers.data$nGene, t.cell.data$nGene, names = c("Outlier nGene", "Total"))
boxplot(t.cell.outliers.data$nUMI, t.cell.data$nUMI, names = c("Outlier nUMI", "nUMI total") )
boxplot(t.cell.outliers.data$percent.mito, t.cell.data$percent.mito, names = c("Outlier mito %", "mito % total") )
par(mfrow=c(1,1))
th.cells <- SubsetData(pbmc33k.merged,
ident.use = c(11,13) )
th.cells.tsne.rot <- tsne.neighbours(data = th.cells@tsne.rot)
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|==== | 7%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|====== | 10%
|
|======= | 10%
|
|======= | 11%
|
|======= | 12%
|
|======== | 12%
|
|======== | 13%
|
|========= | 13%
|
|========= | 14%
|
|========= | 15%
|
|========== | 15%
|
|========== | 16%
|
|=========== | 16%
|
|=========== | 17%
|
|=========== | 18%
|
|============ | 18%
|
|============ | 19%
|
|============= | 19%
|
|============= | 20%
|
|============= | 21%
|
|============== | 21%
|
|============== | 22%
|
|=============== | 22%
|
|=============== | 23%
|
|=============== | 24%
|
|================ | 24%
|
|================ | 25%
|
|================= | 25%
|
|================= | 26%
|
|================= | 27%
|
|================== | 27%
|
|================== | 28%
|
|=================== | 28%
|
|=================== | 29%
|
|=================== | 30%
|
|==================== | 30%
|
|==================== | 31%
|
|==================== | 32%
|
|===================== | 32%
|
|===================== | 33%
|
|====================== | 33%
|
|====================== | 34%
|
|====================== | 35%
|
|======================= | 35%
|
|======================= | 36%
|
|======================== | 36%
|
|======================== | 37%
|
|======================== | 38%
|
|========================= | 38%
|
|========================= | 39%
|
|========================== | 39%
|
|========================== | 40%
|
|========================== | 41%
|
|=========================== | 41%
|
|=========================== | 42%
|
|============================ | 42%
|
|============================ | 43%
|
|============================ | 44%
|
|============================= | 44%
|
|============================= | 45%
|
|============================== | 45%
|
|============================== | 46%
|
|============================== | 47%
|
|=============================== | 47%
|
|=============================== | 48%
|
|================================ | 48%
|
|================================ | 49%
|
|================================ | 50%
|
|================================= | 50%
|
|================================= | 51%
|
|================================= | 52%
|
|================================== | 52%
|
|================================== | 53%
|
|=================================== | 53%
|
|=================================== | 54%
|
|=================================== | 55%
|
|==================================== | 55%
|
|==================================== | 56%
|
|===================================== | 56%
|
|===================================== | 57%
|
|===================================== | 58%
|
|====================================== | 58%
|
|====================================== | 59%
|
|======================================= | 59%
|
|======================================= | 60%
|
|======================================= | 61%
|
|======================================== | 61%
|
|======================================== | 62%
|
|========================================= | 62%
|
|========================================= | 63%
|
|========================================= | 64%
|
|========================================== | 64%
|
|========================================== | 65%
|
|=========================================== | 65%
|
|=========================================== | 66%
|
|=========================================== | 67%
|
|============================================ | 67%
|
|============================================ | 68%
|
|============================================= | 68%
|
|============================================= | 69%
|
|============================================= | 70%
|
|============================================== | 70%
|
|============================================== | 71%
|
|============================================== | 72%
|
|=============================================== | 72%
|
|=============================================== | 73%
|
|================================================ | 73%
|
|================================================ | 74%
|
|================================================ | 75%
|
|================================================= | 75%
|
|================================================= | 76%
|
|================================================== | 76%
|
|================================================== | 77%
|
|================================================== | 78%
|
|=================================================== | 78%
|
|=================================================== | 79%
|
|==================================================== | 79%
|
|==================================================== | 80%
|
|==================================================== | 81%
|
|===================================================== | 81%
|
|===================================================== | 82%
|
|====================================================== | 82%
|
|====================================================== | 83%
|
|====================================================== | 84%
|
|======================================================= | 84%
|
|======================================================= | 85%
|
|======================================================== | 85%
|
|======================================================== | 86%
|
|======================================================== | 87%
|
|========================================================= | 87%
|
|========================================================= | 88%
|
|========================================================== | 88%
|
|========================================================== | 89%
|
|========================================================== | 90%
|
|=========================================================== | 90%
|
|=========================================================== | 91%
|
|=========================================================== | 92%
|
|============================================================ | 92%
|
|============================================================ | 93%
|
|============================================================= | 93%
|
|============================================================= | 94%
|
|============================================================= | 95%
|
|============================================================== | 95%
|
|============================================================== | 96%
|
|=============================================================== | 96%
|
|=============================================================== | 97%
|
|=============================================================== | 98%
|
|================================================================ | 98%
|
|================================================================ | 99%
|
|=================================================================| 99%
|
|=================================================================| 100%
plot(th.cells.tsne.rot$tSNE_2~th.cells.tsne.rot$tSNE_1,
pch=19, cex=0.4, col = ifelse(th.cells.tsne.rot$neighbours < 20 ,'red','green'))
th.cells.outliers <- SubsetData(th.cells,
cells.use = WhichCells(th.cells, cells.use = th.cells.tsne.rot$neighbours < 20))
th.cells.non.outliers <- SubsetData(th.cells,
cells.use = WhichCells(th.cells, cells.use = th.cells.tsne.rot$neighbours >= 20))
TSNEPlot(th.cells)
TSNEPlot(th.cells.outliers)
TSNEPlot(th.cells.non.outliers)
TSNEPlot(pbmc33k.merged)
VlnPlot(th.cells.outliers, c("MS4A1"), size.use = 0.2)
VlnPlot(th.cells.non.outliers, c("MS4A1"), size.use = 0.2)
th.outliers.ms4a1 <- th.cells.outliers@data["MS4A1",]
th.non.outliers.ms4a1 <- th.cells.non.outliers@data["MS4A1",]
# Fraction ms4a1 expressing cells in outliers and mean expression
sum(th.outliers.ms4a1 != 0) / length(th.outliers.ms4a1)
## [1] 0.1641026
mean(th.outliers.ms4a1)
## [1] 0.3334613
# Fraction ms4a1 expressing cells in outliers
sum(th.non.outliers.ms4a1 != 0) / length(th.non.outliers.ms4a1)
## [1] 0.02631091
mean(th.non.outliers.ms4a1)
## [1] 0.01604771
FeaturePlot(th.cells.outliers, c("MS4A1"),cols.use = c("lightgrey","blue"), nCol = 2)
FeaturePlot(th.cells.outliers, c("GNLY"),cols.use = c("lightgrey","blue"))
FeaturePlot(th.cells, c("MS4A1", "GNLY", "LYZ", "CD8A"), cols.use = c("lightgrey","blue"), nCol = 2)
FeaturePlot(th.cells.outliers, c("nUMI"),cols.use = c("lightgrey","blue"))
FeaturePlot(th.cells, c("CD3E"),cols.use = c("lightgrey","blue"))